Data

dataset <- read.csv("C:\\Kate\\Research\\Property\\Data\\property_water_claims_non_cat_fs_v5.csv", header=TRUE)

Claim Partial Dependency XGB Classification dataset exported from a Python notebook

pd_dataset <- read.csv("C:\\Kate\\Research\\Property\\Data\\EDA_PartialDependency.csv", header=TRUE)
library(funModeling)
## Loading required package: Hmisc
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.5.3
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
## 
##     format.pval, units
## funModeling v.1.6.5 :)
## Examples and tutorials at livebook.datascienceheroes.com
library(ggplot2)
colnames(dataset)
##   [1] "modeldata_id"                       
##   [2] "systemidstart"                      
##   [3] "systemidend"                        
##   [4] "cal_year"                           
##   [5] "startdate"                          
##   [6] "enddate"                            
##   [7] "startdatetm"                        
##   [8] "enddatetm"                          
##   [9] "ecy"                                
##  [10] "log_ecy"                            
##  [11] "policynumber"                       
##  [12] "policy_uniqueid"                    
##  [13] "policyterm"                         
##  [14] "policytype"                         
##  [15] "effectivedate"                      
##  [16] "expirationdate"                     
##  [17] "policystate"                        
##  [18] "policyform"                         
##  [19] "persistency"                        
##  [20] "companycd"                          
##  [21] "carriercd"                          
##  [22] "agency_group"                       
##  [23] "producername"                       
##  [24] "territory"                          
##  [25] "risknumber"                         
##  [26] "risktype"                           
##  [27] "yearbuilt"                          
##  [28] "log_yearbuilt"                      
##  [29] "sqft"                               
##  [30] "log_sqft"                           
##  [31] "stories"                            
##  [32] "roofcd"                             
##  [33] "roofcd_encd"                        
##  [34] "units"                              
##  [35] "occupancycd"                        
##  [36] "occupancy_encd"                     
##  [37] "allperilded"                        
##  [38] "waterded"                           
##  [39] "protectionclass"                    
##  [40] "constructioncd"                     
##  [41] "constructioncd_encd"                
##  [42] "fire_risk_model_score"              
##  [43] "multipolicyind"                     
##  [44] "multipolicyindumbrella"             
##  [45] "earthquakeumbrellaind"              
##  [46] "usagetype"                          
##  [47] "usagetype_encd"                     
##  [48] "ordinanceorlawpct"                  
##  [49] "functionalreplacementcost"          
##  [50] "homegardcreditind"                  
##  [51] "sprinklersystem"                    
##  [52] "landlordind"                        
##  [53] "rentersinsurance"                   
##  [54] "firealarmtype"                      
##  [55] "burglaryalarmtype"                  
##  [56] "waterdetectiondevice"               
##  [57] "neighborhoodcrimewatchind"          
##  [58] "propertymanager"                    
##  [59] "safeguardplusind"                   
##  [60] "kitchenfireextinguisherind"         
##  [61] "gatedcommunityind"                  
##  [62] "deadboltind"                        
##  [63] "poolind"                            
##  [64] "replacementcostdwellingind"         
##  [65] "replacementvalueind"                
##  [66] "serviceline"                        
##  [67] "equipmentbreakdown"                 
##  [68] "numberoffamilies"                   
##  [69] "insuredage"                         
##  [70] "maritalstatus"                      
##  [71] "insurancescore"                     
##  [72] "overriddeninsurancescore"           
##  [73] "insurancescorevalue"                
##  [74] "insscoretiervalueband"              
##  [75] "financialstabilitytier"             
##  [76] "allcov_wp"                          
##  [77] "cova_wp"                            
##  [78] "cova_ep"                            
##  [79] "cova_deductible"                    
##  [80] "log_cova_deductible"                
##  [81] "cova_limit"                         
##  [82] "log_cova_limit"                     
##  [83] "cova_ic_nc_water"                   
##  [84] "hasclaim"                           
##  [85] "cova_il_nc_water"                   
##  [86] "log_cova_il_nc_water"               
##  [87] "water_risk_3_blk"                   
##  [88] "log_water_risk_3_blk"               
##  [89] "water_risk_fre_3_blk"               
##  [90] "log_water_risk_fre_3_blk"           
##  [91] "water_risk_sev_3_blk"               
##  [92] "log_water_risk_sev_3_blk"           
##  [93] "appl_fail_3_blk"                    
##  [94] "fixture_leak_3_blk"                 
##  [95] "pipe_froze_3_blk"                   
##  [96] "plumb_leak_3_blk"                   
##  [97] "rep_cost_3_blk"                     
##  [98] "ustructure_fail_3_blk"              
##  [99] "waterh_fail_3_blk"                  
## [100] "loaddate"                           
## [101] "customer_cnt_active_policies"       
## [102] "customer_cnt_active_policies_binned"

Quick Overview

#dataset <- dataset[,all]
str(dataset)
## 'data.frame':    2306865 obs. of  102 variables:
##  $ modeldata_id                       : int  926880 369590 371058 366474 367033 430043 1059307 372727 373618 369334 ...
##  $ systemidstart                      : int  2517514 786489 788544 782278 783043 962975 3070559 791048 792228 786104 ...
##  $ systemidend                        : int  2517514 786489 788557 782278 949234 962975 3070559 1190404 1068944 786104 ...
##  $ cal_year                           : int  2015 2012 2010 2011 2011 2011 2015 2012 2011 2011 ...
##  $ startdate                          : Factor w/ 4135 levels "2009-01-01","2009-01-02",..: 2192 1096 628 731 882 997 2436 1096 1044 731 ...
##  $ enddate                            : Factor w/ 4104 levels "2009-01-09 00:00:00",..: 2166 1190 698 754 1063 1063 2524 1262 1063 741 ...
##  $ startdatetm                        : Factor w/ 22990 levels "2008-01-09 00:00:00",..: 11981 4408 3173 2108 4554 5397 13368 4904 5705 2038 ...
##  $ enddatetm                          : Factor w/ 16232 levels "2009-01-09 00:00:00",..: 7522 3965 3045 2253 4065 4530 9644 4258 3939 2205 ...
##  $ ecy                                : num  0.0191 0.3477 0.2819 0.1533 0.5859 ...
##  $ log_ecy                            : num  -3.958 -1.056 -1.266 -1.875 -0.535 ...
##  $ policynumber                       : Factor w/ 243197 levels "AZF0082147","AZF0221975",..: 97 116 152 277 789 585 18 201 220 110 ...
##  $ policy_uniqueid                    : int  823918 365559 366890 362938 363414 439016 931611 368459 369193 365318 ...
##  $ policyterm                         : int  5 2 1 1 2 2 6 2 2 1 ...
##  $ policytype                         : Factor w/ 2 levels "New","Renewal": 2 2 1 1 2 2 2 2 2 1 ...
##  $ effectivedate                      : Factor w/ 4178 levels "2008-01-09","2008-01-14",..: 2206 900 671 465 925 1040 2479 972 1087 452 ...
##  $ expirationdate                     : Factor w/ 4299 levels "2009-01-09","2009-01-14",..: 2327 1022 792 586 1047 1162 2601 1094 1209 573 ...
##  $ policystate                        : Factor w/ 3 levels "AZ","CA","NV": 1 1 1 1 1 1 1 1 1 1 ...
##  $ policyform                         : Factor w/ 9 levels "DF1","DF3","DF6",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ persistency                        : int  13 9 6 4 2 3 15 6 6 8 ...
##  $ companycd                          : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ carriercd                          : Factor w/ 2 levels "CSEICO","CSESG": 1 1 1 1 1 1 1 1 1 1 ...
##  $ agency_group                       : Factor w/ 605 levels "","1ST CENTURY INS SVCS INC.",..: 464 374 504 374 579 260 96 533 245 542 ...
##  $ producername                       : Factor w/ 1267 levels "1ST CENTURY INS SVCS INC.",..: 1006 732 1074 732 1217 443 150 1143 423 1152 ...
##  $ territory                          : Factor w/ 10 levels "","AZ-A","AZ-T",..: 2 2 6 2 3 2 6 2 2 2 ...
##  $ risknumber                         : int  1 1 1 1 1 1 2 1 1 1 ...
##  $ risktype                           : Factor w/ 2 levels "Dwelling","Homeowners": 1 1 1 1 1 1 1 1 1 1 ...
##  $ yearbuilt                          : int  1977 1960 2001 1995 2005 1979 1994 2003 1983 1986 ...
##  $ log_yearbuilt                      : num  7.59 7.58 7.6 7.6 7.6 ...
##  $ sqft                               : int  1600 1200 1600 1500 1300 2100 1400 1500 4000 1300 ...
##  $ log_sqft                           : num  7.41 7.09 7.4 7.37 7.17 ...
##  $ stories                            : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ roofcd                             : Factor w/ 7 levels "COMPO","MEMBRANE",..: 1 1 6 1 6 1 1 1 4 1 ...
##  $ roofcd_encd                        : int  8 8 7 8 7 8 8 8 6 8 ...
##  $ units                              : int  1 1 1 1 1 1 1 1 4 1 ...
##  $ occupancycd                        : Factor w/ 3 levels "NO","OCCUPIEDNOW",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ occupancy_encd                     : int  2 2 2 2 2 2 2 2 2 2 ...
##  $ allperilded                        : int  250 500 500 500 2500 500 500 500 500 500 ...
##  $ waterded                           : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ protectionclass                    : int  3 2 4 4 6 2 3 3 2 3 ...
##  $ constructioncd                     : Factor w/ 5 levels "AF","B","F","M",..: 3 4 3 3 3 4 3 3 3 3 ...
##  $ constructioncd_encd                : int  5 1 5 5 5 1 5 5 5 5 ...
##  $ fire_risk_model_score              : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ multipolicyind                     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ multipolicyindumbrella             : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ earthquakeumbrellaind              : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ usagetype                          : Factor w/ 7 levels "COC","PRIMARY",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ usagetype_encd                     : int  6 6 6 6 6 6 6 6 6 6 ...
##  $ ordinanceorlawpct                  : int  10 10 10 10 10 10 10 10 10 10 ...
##  $ functionalreplacementcost          : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ homegardcreditind                  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ sprinklersystem                    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ landlordind                        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ rentersinsurance                   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ firealarmtype                      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ burglaryalarmtype                  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ waterdetectiondevice               : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ neighborhoodcrimewatchind          : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ propertymanager                    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ safeguardplusind                   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ kitchenfireextinguisherind         : int  0 0 0 0 1 0 0 0 0 0 ...
##  $ gatedcommunityind                  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ deadboltind                        : int  0 0 0 0 1 0 0 0 0 0 ...
##  $ poolind                            : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ replacementcostdwellingind         : int  1 0 1 1 1 1 0 1 1 1 ...
##  $ replacementvalueind                : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ serviceline                        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ equipmentbreakdown                 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ numberoffamilies                   : int  1 1 1 1 1 1 1 1 4 1 ...
##  $ insuredage                         : int  NA NA NA NA 52 66 NA NA NA NA ...
##  $ maritalstatus                      : Factor w/ 5 levels "~","Divorced",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ insurancescore                     : Factor w/ 3836 levels "(DOES","~","610",..: 2 2 2 2 2 11 2 2 2 2 ...
##  $ overriddeninsurancescore           : Factor w/ 41 levels "~","01","02",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ insurancescorevalue                : Factor w/ 92 levels "~","630","645",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ insscoretiervalueband              : Factor w/ 22 levels "~","624-632",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ financialstabilitytier             : Factor w/ 42 levels "","~","01","02",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ allcov_wp                          : num  564 446 460 494 422 ...
##  $ cova_wp                            : num  471 418 376 437 364 ...
##  $ cova_ep                            : num  9 145 106 67 213 ...
##  $ cova_deductible                    : int  250 500 500 500 2500 500 500 500 500 500 ...
##  $ log_cova_deductible                : num  5.52 6.21 6.21 6.21 7.82 ...
##  $ cova_limit                         : int  200000 200000 200000 200000 200000 300000 200000 300000 500000 200000 ...
##  $ log_cova_limit                     : num  12.1 12.2 12 12.2 12.2 ...
##  $ cova_ic_nc_water                   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ hasclaim                           : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ cova_il_nc_water                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ log_cova_il_nc_water               : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ water_risk_3_blk                   : int  168 201 222 233 266 215 191 184 172 243 ...
##  $ log_water_risk_3_blk               : num  5.12 5.3 5.4 5.45 5.58 ...
##  $ water_risk_fre_3_blk               : int  209 243 218 262 214 257 187 200 242 209 ...
##  $ log_water_risk_fre_3_blk           : num  5.34 5.49 5.38 5.57 5.37 ...
##  $ water_risk_sev_3_blk               : int  83 85 105 92 128 86 105 95 73 119 ...
##  $ log_water_risk_sev_3_blk           : num  4.42 4.44 4.65 4.52 4.85 ...
##  $ appl_fail_3_blk                    : int  5 5 5 5 5 5 5 5 4 5 ...
##  $ fixture_leak_3_blk                 : int  1 1 3 3 2 3 1 1 5 1 ...
##  $ pipe_froze_3_blk                   : int  2 0 0 0 0 0 3 2 0 3 ...
##  $ plumb_leak_3_blk                   : int  4 5 1 4 4 4 5 5 5 4 ...
##  $ rep_cost_3_blk                     : int  1 5 5 4 5 4 0 1 4 0 ...
##  $ ustructure_fail_3_blk              : int  5 5 5 5 5 5 5 5 5 5 ...
##  $ waterh_fail_3_blk                  : int  2 3 3 3 3 3 1 3 2 4 ...
##   [list output truncated]
summary(dataset)
##   modeldata_id     systemidstart      systemidend         cal_year   
##  Min.   :      1   Min.   :      2   Min.   :      3   Min.   :2009  
##  1st Qu.: 499011   1st Qu.:1131009   1st Qu.:1197320   1st Qu.:2012  
##  Median : 990532   Median :2753282   Median :2865110   Median :2015  
##  Mean   :1008134   Mean   :3472157   Mean   :3576917   Mean   :2015  
##  3rd Qu.:1516564   3rd Qu.:5616055   3rd Qu.:5805417   3rd Qu.:2018  
##  Max.   :2175195   Max.   :9614887   Max.   :9617520   Max.   :2020  
##                                                                      
##       startdate                      enddate       
##  2019-01-01: 128258   2019-01-01 00:00:00: 128231  
##  2018-01-01: 125314   2018-01-01 00:00:00: 125249  
##  2020-01-01: 106826   2020-01-01 00:00:00: 106916  
##  2017-01-01: 106306   2017-01-01 00:00:00: 106196  
##  2013-01-01:  97750   2013-01-01 00:00:00:  97762  
##  2014-01-01:  96383   2014-01-01 00:00:00:  96396  
##  (Other)   :1646028   (Other)            :1646115  
##               startdatetm                    enddatetm      
##  2018-06-01 00:00:00:   1848   2019-06-01 00:00:00:   2207  
##  2018-07-01 00:00:00:   1703   2018-06-01 00:00:00:   2110  
##  2017-09-01 00:00:00:   1695   2019-07-01 00:00:00:   2088  
##  2017-07-01 00:00:00:   1693   2018-07-01 00:00:00:   2044  
##  2017-06-01 00:00:00:   1575   2019-08-01 00:00:00:   2002  
##  2018-08-01 00:00:00:   1572   2018-09-01 00:00:00:   1977  
##  (Other)            :2296779   (Other)            :2294437  
##       ecy            log_ecy              policynumber    
##  Min.   :0.0027   Min.   :-5.914504   CAF0389924:    201  
##  1st Qu.:0.2381   1st Qu.:-1.435065   CAF0461789:    184  
##  Median :0.4462   Median :-0.806988   CAF0475516:    154  
##  Mean   :0.4672   Mean   :-1.054233   CAF0393082:    151  
##  3rd Qu.:0.6981   3rd Qu.:-0.359393   CAF0464778:    132  
##  Max.   :1.0020   Max.   : 0.001998   CAF0468201:    110  
##                                       (Other)   :2305933  
##  policy_uniqueid     policyterm       policytype         effectivedate    
##  Min.   :      1   Min.   : 1.000   New    : 526336   2018-06-01:   2239  
##  1st Qu.: 452694   1st Qu.: 2.000   Renewal:1780529   2017-07-01:   2169  
##  Median : 871909   Median : 3.000                     2018-07-01:   2119  
##  Mean   : 876050   Mean   : 3.889                     2017-06-01:   2114  
##  3rd Qu.:1308425   3rd Qu.: 6.000                     2017-09-01:   2114  
##  Max.   :1832702   Max.   :13.000                     2018-08-01:   2059  
##                                                       (Other)   :2294051  
##     expirationdate    policystate        policyform       persistency     
##  2019-06-01:   2239   AZ: 165112   HO3        :1225988   Min.   :  0.000  
##  2018-07-01:   2167   CA:2060228   DF3        : 905154   1st Qu.:  1.000  
##  2019-07-01:   2118   NV:  81525   DF6        :  79589   Median :  3.000  
##  2018-06-01:   2117                Form3      :  48635   Mean   :  6.204  
##  2018-09-01:   2114                FL1-Vacant :  23766   3rd Qu.:  9.000  
##  2019-08-01:   2053                FL3-Special:  12342   Max.   :103.000  
##  (Other)   :2294057                (Other)    :  11391                    
##    companycd      carriercd      
##  Min.   : 1.00   CSEICO: 799903  
##  1st Qu.: 1.00   CSESG :1506962  
##  Median :17.00                   
##  Mean   :12.02                   
##  3rd Qu.:17.00                   
##  Max.   :19.00                   
##                                  
##                                        agency_group    
##  WESTERN GOLD INS AGCY INC.                  : 242950  
##  J.E. BROWN and ASSOCS INS SVCS              :  93954  
##  CRUSBERG DECKER INS SVCS INC                :  77549  
##  PIIB - PACIFIC INTERSTATE INS               :  73917  
##  ISU INSURANCE SERVICES OF  SAN FRANCISCO INC:  66214  
##  Acrisure of California                      :  58796  
##  (Other)                                     :1693485  
##                          producername       territory     
##  WESTERN GOLD INS AGCY INC.    : 240992   CA-B   :684831  
##  J.E. BROWN and ASSOCS INS SVCS:  89662   CA-C   :488239  
##  CRUSBERG DECKER INS SVCS INC  :  52069   CA-O   :399153  
##  NATIONAL INSURANCE SOLUTIONS  :  31965   CA-A   :347771  
##  BICHLMEIER INSURANCE SRVS INC :  28097   CA-T   :153736  
##  Acrisure of California LLC    :  27745   AZ-A   :146637  
##  (Other)                       :1836335   (Other): 86498  
##    risknumber           risktype         yearbuilt    log_yearbuilt  
##  Min.   : 0.000   Dwelling  :1032242   Min.   :1900   Min.   :7.523  
##  1st Qu.: 1.000   Homeowners:1274623   1st Qu.:1959   1st Qu.:7.580  
##  Median : 1.000                        Median :1979   Median :7.590  
##  Mean   : 1.005                        Mean   :1976   Mean   :7.588  
##  3rd Qu.: 1.000                        3rd Qu.:1996   3rd Qu.:7.599  
##  Max.   :16.000                        Max.   :2019   Max.   :7.610  
##                                                                      
##       sqft         log_sqft        stories           roofcd       
##  Min.   : 800   Min.   :6.397   Min.   :1.000   COMPO   :1143155  
##  1st Qu.:1300   1st Qu.:7.182   1st Qu.:1.000   MEMBRANE:  29439  
##  Median :1700   Median :7.441   Median :1.000   METAL   :   6126  
##  Mean   :1872   Mean   :7.471   Mean   :1.183   OTHER   : 270319  
##  3rd Qu.:2300   3rd Qu.:7.749   3rd Qu.:1.000   TAR     :  54264  
##  Max.   :5000   Max.   :9.210   Max.   :3.000   TILE    : 781454  
##                                                 WOOD    :  22108  
##   roofcd_encd        units           occupancycd      occupancy_encd 
##  Min.   :1.000   Min.   :1.00   NO         :      5   Min.   :1.000  
##  1st Qu.:7.000   1st Qu.:1.00   OCCUPIEDNOW:2139270   1st Qu.:1.000  
##  Median :7.000   Median :1.00   TENANT     : 167590   Median :1.000  
##  Mean   :7.216   Mean   :1.12                         Mean   :1.073  
##  3rd Qu.:8.000   3rd Qu.:1.00                         3rd Qu.:1.000  
##  Max.   :8.000   Max.   :4.00                         Max.   :3.000  
##                                                                      
##   allperilded       waterded       protectionclass constructioncd 
##  Min.   :    0   Min.   :    0.0   Min.   : 0.00   AF   : 824017  
##  1st Qu.: 1000   1st Qu.:    0.0   1st Qu.: 2.00   B    :  23379  
##  Median : 1000   Median :    0.0   Median : 3.00   F    :1412361  
##  Mean   : 1361   Mean   :   70.1   Mean   : 3.02   M    :  25734  
##  3rd Qu.: 2500   3rd Qu.:    0.0   3rd Qu.: 4.00   OTHER:  21374  
##  Max.   :10000   Max.   :10000.0   Max.   :10.00                  
##                                                                   
##  constructioncd_encd fire_risk_model_score multipolicyind  
##  Min.   :1.00        Min.   :-1.0000       Min.   :0.0000  
##  1st Qu.:4.00        1st Qu.: 0.0000       1st Qu.:0.0000  
##  Median :5.00        Median : 0.0000       Median :0.0000  
##  Mean   :4.55        Mean   : 0.2252       Mean   :0.1658  
##  3rd Qu.:5.00        3rd Qu.: 0.0000       3rd Qu.:0.0000  
##  Max.   :5.00        Max.   :18.0000       Max.   :1.0000  
##                                                            
##  multipolicyindumbrella earthquakeumbrellaind      usagetype      
##  Min.   :0.000000       Min.   :0.000000      COC       :  12076  
##  1st Qu.:0.000000       1st Qu.:0.000000      PRIMARY   :1338879  
##  Median :0.000000       Median :0.000000      RENTAL    : 929952  
##  Mean   :0.004352       Mean   :0.003712      SEASONAL  :  10109  
##  3rd Qu.:0.000000       3rd Qu.:0.000000      SECONDARY :   4145  
##  Max.   :1.000000       Max.   :1.000000      UNOCCUPIED:    356  
##                                               VACANT    :  11348  
##  usagetype_encd  ordinanceorlawpct functionalreplacementcost
##  Min.   :1.000   Min.   :  0.000   Min.   :0.000000         
##  1st Qu.:6.000   1st Qu.:  0.000   1st Qu.:0.000000         
##  Median :7.000   Median : 10.000   Median :0.000000         
##  Mean   :6.544   Mean   :  9.308   Mean   :0.001543         
##  3rd Qu.:7.000   3rd Qu.: 10.000   3rd Qu.:0.000000         
##  Max.   :7.000   Max.   :100.000   Max.   :1.000000         
##                                                             
##  homegardcreditind sprinklersystem    landlordind      rentersinsurance  
##  Min.   :0.000     Min.   :0.00000   Min.   :0.00000   Min.   :0.000000  
##  1st Qu.:0.000     1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.000000  
##  Median :0.000     Median :0.00000   Median :0.00000   Median :0.000000  
##  Mean   :0.141     Mean   :0.03073   Mean   :0.06455   Mean   :0.004432  
##  3rd Qu.:0.000     3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:0.000000  
##  Max.   :1.000     Max.   :1.00000   Max.   :1.00000   Max.   :1.000000  
##                                                                          
##  firealarmtype    burglaryalarmtype waterdetectiondevice
##  Min.   :0.0000   Min.   :0.0000    Min.   :0.0000000   
##  1st Qu.:0.0000   1st Qu.:0.0000    1st Qu.:0.0000000   
##  Median :1.0000   Median :0.0000    Median :0.0000000   
##  Mean   :0.6033   Mean   :0.3378    Mean   :0.0001647   
##  3rd Qu.:1.0000   3rd Qu.:1.0000    3rd Qu.:0.0000000   
##  Max.   :1.0000   Max.   :1.0000    Max.   :1.0000000   
##                                                         
##  neighborhoodcrimewatchind propertymanager   safeguardplusind
##  Min.   :0.00000           Min.   :0.00000   Min.   :0.0000  
##  1st Qu.:0.00000           1st Qu.:0.00000   1st Qu.:0.0000  
##  Median :0.00000           Median :0.00000   Median :0.0000  
##  Mean   :0.01469           Mean   :0.01574   Mean   :0.3547  
##  3rd Qu.:0.00000           3rd Qu.:0.00000   3rd Qu.:1.0000  
##  Max.   :1.00000           Max.   :1.00000   Max.   :1.0000  
##                                                              
##  kitchenfireextinguisherind gatedcommunityind  deadboltind    
##  Min.   :0.0000             Min.   :0.00000   Min.   :0.0000  
##  1st Qu.:0.0000             1st Qu.:0.00000   1st Qu.:0.0000  
##  Median :0.0000             Median :0.00000   Median :1.0000  
##  Mean   :0.4405             Mean   :0.01349   Mean   :0.7197  
##  3rd Qu.:1.0000             3rd Qu.:0.00000   3rd Qu.:1.0000  
##  Max.   :1.0000             Max.   :1.00000   Max.   :1.0000  
##                                                               
##     poolind        replacementcostdwellingind replacementvalueind
##  Min.   :0.00000   Min.   :0.0000             Min.   :0.00000    
##  1st Qu.:0.00000   1st Qu.:0.0000             1st Qu.:0.00000    
##  Median :0.00000   Median :0.0000             Median :0.00000    
##  Mean   :0.03309   Mean   :0.3304             Mean   :0.01724    
##  3rd Qu.:0.00000   3rd Qu.:1.0000             3rd Qu.:0.00000    
##  Max.   :1.00000   Max.   :1.0000             Max.   :1.00000    
##                                                                  
##   serviceline     equipmentbreakdown numberoffamilies   insuredage    
##  Min.   :0.0000   Min.   :0.0000     Min.   :0.00     Min.   :  0.00  
##  1st Qu.:0.0000   1st Qu.:0.0000     1st Qu.:1.00     1st Qu.: 46.00  
##  Median :0.0000   Median :0.0000     Median :1.00     Median : 56.00  
##  Mean   :0.1005   Mean   :0.1045     Mean   :1.12     Mean   : 56.19  
##  3rd Qu.:0.0000   3rd Qu.:0.0000     3rd Qu.:1.00     3rd Qu.: 66.00  
##  Max.   :1.0000   Max.   :1.0000     Max.   :4.00     Max.   :177.00  
##                                                       NA's   :269594  
##   maritalstatus     insurancescore    overriddeninsurancescore
##  ~       :1041080   ~      :2213757   ~      :2157007         
##  Divorced:   9456   99     :    495   99     :  48119         
##  Married : 613978   KQXKD  :    155   07     :  10322         
##  Single  : 619428   (DOES  :    136   12     :   8445         
##  Widowed :  22923   UXKQS  :    114   04     :   6887         
##                     ZKKWS  :    113   09     :   6270         
##                     (Other):  92095   (Other):  69815         
##  insurancescorevalue insscoretiervalueband financialstabilitytier
##  ~      :2306585     ~      :2306585              :2130960       
##  825    :     13     865-880:     49       ~      : 164611       
##  875    :     12     837-865:     24       07     :   1070       
##  769    :      8     748-774:     23       12     :    906       
##  872    :      8     894-945:     23       13     :    693       
##  902    :      8     820-837:     22       04     :    686       
##  (Other):    231     (Other):    139       (Other):   7939       
##    allcov_wp          cova_wp           cova_ep          cova_deductible
##  Min.   :   32.0   Min.   :    5.0   Min.   :    0.049   Min.   :    0  
##  1st Qu.:  550.0   1st Qu.:  477.0   1st Qu.:  143.359   1st Qu.: 1000  
##  Median :  774.0   Median :  771.0   Median :  311.837   Median : 1000  
##  Mean   :  903.6   Mean   :  988.5   Mean   :  464.999   Mean   : 1361  
##  3rd Qu.: 1091.0   3rd Qu.: 1256.0   3rd Qu.:  605.821   3rd Qu.: 2500  
##  Max.   :18926.0   Max.   :14620.0   Max.   :12121.894   Max.   :10000  
##                                                                         
##  log_cova_deductible   cova_limit      log_cova_limit   cova_ic_nc_water  
##  Min.   :0.000       Min.   : 100000   Min.   : 8.294   Min.   :0.000000  
##  1st Qu.:6.908       1st Qu.: 300000   1st Qu.:12.385   1st Qu.:0.000000  
##  Median :6.908       Median : 400000   Median :12.695   Median :0.000000  
##  Mean   :6.994       Mean   : 418954   Mean   :12.687   Mean   :0.006261  
##  3rd Qu.:7.824       3rd Qu.: 500000   3rd Qu.:13.024   3rd Qu.:0.000000  
##  Max.   :9.210       Max.   :1300000   Max.   :14.944   Max.   :3.000000  
##                                                                           
##     hasclaim       cova_il_nc_water   log_cova_il_nc_water
##  Min.   :0.00000   Min.   : -5536.9   Min.   :-0.10536    
##  1st Qu.:0.00000   1st Qu.:     0.0   1st Qu.: 0.00000    
##  Median :0.00000   Median :     0.0   Median : 0.00000    
##  Mean   :0.00613   Mean   :    83.1   Mean   : 0.05379    
##  3rd Qu.:0.00000   3rd Qu.:     0.0   3rd Qu.: 0.00000    
##  Max.   :1.00000   Max.   :522735.2   Max.   :13.16683    
##                                       NA's   :3           
##  water_risk_3_blk log_water_risk_3_blk water_risk_fre_3_blk
##  Min.   :  21.0   Min.   :3.045        Min.   :  19.0      
##  1st Qu.: 135.0   1st Qu.:4.905        1st Qu.: 108.0      
##  Median : 185.0   Median :5.220        Median : 154.0      
##  Mean   : 201.5   Mean   :5.199        Mean   : 168.4      
##  3rd Qu.: 243.0   3rd Qu.:5.493        3rd Qu.: 208.0      
##  Max.   :1491.0   Max.   :7.307        Max.   :2308.0      
##                                                            
##  log_water_risk_fre_3_blk water_risk_sev_3_blk log_water_risk_sev_3_blk
##  Min.   :2.944            Min.   : 33.0        Min.   :3.497           
##  1st Qu.:4.682            1st Qu.:106.0        1st Qu.:4.663           
##  Median :5.037            Median :125.0        Median :4.828           
##  Mean   :5.011            Mean   :127.7        Mean   :4.821           
##  3rd Qu.:5.338            3rd Qu.:147.0        3rd Qu.:4.990           
##  Max.   :7.744            Max.   :313.0        Max.   :5.746           
##                                                                        
##  appl_fail_3_blk fixture_leak_3_blk pipe_froze_3_blk plumb_leak_3_blk
##  Min.   :0.00    Min.   :0.000      Min.   :0.000    Min.   :0.000   
##  1st Qu.:4.00    1st Qu.:1.000      1st Qu.:2.000    1st Qu.:1.000   
##  Median :5.00    Median :2.000      Median :2.000    Median :4.000   
##  Mean   :4.13    Mean   :1.882      Mean   :1.746    Mean   :3.068   
##  3rd Qu.:5.00    3rd Qu.:3.000      3rd Qu.:2.000    3rd Qu.:4.000   
##  Max.   :5.00    Max.   :5.000      Max.   :5.000    Max.   :5.000   
##                                                                      
##  rep_cost_3_blk  ustructure_fail_3_blk waterh_fail_3_blk
##  Min.   :0.000   Min.   :0.000         Min.   :0.00     
##  1st Qu.:5.000   1st Qu.:5.000         1st Qu.:0.00     
##  Median :5.000   Median :5.000         Median :1.00     
##  Mean   :4.801   Mean   :4.435         Mean   :1.16     
##  3rd Qu.:5.000   3rd Qu.:5.000         3rd Qu.:2.00     
##  Max.   :5.000   Max.   :5.000         Max.   :5.00     
##                                                         
##                    loaddate       customer_cnt_active_policies
##  2020-07-29 05:01:19.56:2306865   Min.   :  1.00              
##                                   1st Qu.:  1.00              
##                                   Median :  1.00              
##                                   Mean   :  1.69              
##                                   3rd Qu.:  1.00              
##                                   Max.   :147.00              
##                                                               
##  customer_cnt_active_policies_binned
##  Min.   :  1.000                    
##  1st Qu.:  1.000                    
##  Median :  1.000                    
##  Mean   :  2.806                    
##  3rd Qu.:  1.000                    
##  Max.   :150.000                    
## 

Visualization

Number of claims

dataset$cova_ic_nc_water_color <- as.factor(dataset$cova_ic_nc_water)
ggplot(dataset, aes(x = cova_ic_nc_water, fill=cova_ic_nc_water_color)) +
     geom_bar() +
     labs(x = 'Number of Claims', y = 'Count', title = 'Histogram of Number of Claims') +
     scale_fill_manual("legend", values = c("0" = "#56B4E9", "1" = "red", "2" = "red", "3" = "red")) + 
     geom_text(stat='count', aes(label=..count..), vjust=1)

Since the cases when there are more then 1 claim per exposure are very rare and, I create a new logical attribute HasClaim with values 1 or 0. It can be used in logistic regression but I use it visualize if there is any visual dependency between predictors and claims.

dataset$hasclaim <- as.factor(dataset$hasclaim)
ggplot(dataset, aes(x = hasclaim, fill=hasclaim)) +
     geom_bar() +
     scale_fill_manual("legend", values = c("0" = "#56B4E9", "1" = "red")) + 
     labs(x = 'Adjusted Number of Claims', y = 'Count', title = 'Adjusted Histogram of Number of Claims') +
geom_text(stat='count', aes(label=..count..), vjust=1)

There is cery low percent of water related claims: 0.6%

Predictors

(the order is from XGB Classification feature importance)

ratio <- nrow(dataset)
#grid for multiplots
multiplot <- function(..., plotlist = NULL, file, cols = 1, layout = NULL) {
  require(grid)

  plots <- c(list(...), plotlist)

  numPlots = length(plots)

  if (is.null(layout)) {
    layout <- matrix(seq(1, cols * ceiling(numPlots/cols)),
                    ncol = cols, nrow = ceiling(numPlots/cols))
  }

  if (numPlots == 1) {
    print(plots[[1]])

  } else {
    grid.newpage()
    pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))

    for (i in 1:numPlots) {
      matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))

      print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
                                      layout.pos.col = matchidx$col))
    }
  }
}
#continuous attributes
plot_continuous <- function (col_name) {
  p1 <- ggplot(dataset, aes(x = .data[[col_name]], fill=hasclaim)) +
  geom_histogram(bins=100) +
  scale_fill_manual(breaks = c("0","1"), 
                       values=c("#56B4E9","red")) +   
    
  labs(x = col_name, y = 'Count', title = paste("Histogram of", col_name))

 #p2 <- ggplot(dataset, aes(x = .data[[col_name]])) +
  #geom_density() + 
 # labs(x = col_name, y = 'Density', title = paste("Density of", col_name))
  
  pd_col <- pd_dataset[pd_dataset$feature == col_name,][c('value','pd')]
  
  p2 <- ggplot(pd_col, aes(x = value, y = pd)) +
  geom_line(aes(color="darkred")) + 
 labs(x = col_name, y = 'pd', title = paste("Claim Partial Dependency of", col_name))

  p3 <- ggplot(dataset, aes(x=hasclaim, y=.data[[col_name]], col=hasclaim, fill=hasclaim)) +
  geom_boxplot(notch = TRUE) +
  scale_fill_manual(breaks = c("0","1"), 
                      values=c("#56B4E9","red")) +   
  theme(legend.position = "none") +
  labs(y = col_name, title = paste("Box Plot of", col_name, "with hasclaim"))

  vec <- dataset[[col_name]]
  y <- quantile(vec[!is.na(vec)], c(0.25, 0.75))
  x <- qnorm(c(0.25, 0.75))
  slope <- diff(y)/diff(x)
  int <- y[1L] - slope * x[1L]

  p4 <- ggplot(dataset, aes(sample = .data[[col_name]], col='red')) + 
  stat_qq() + 
  geom_abline(slope = slope, intercept = int) +
  theme(legend.position = "none") +
  labs(y = col_name, title = paste("QQ Plot of", col_name))

  
  multiplot(p1,p2,p3,p4, cols=2)
}
#categorical attributes - multiplot does not work for fun modeling
plot_categorical <- function(col_name) {
  
  if ( col_name == 'roofcd_encd') {
col_name_original <- 'roofcd'
} 
  else if ( col_name == 'usagetype_encd' ) {
col_name_original <- 'usagetype'
} 
else if ( col_name == 'constructioncd_encd' ) {
col_name_original <- 'constructioncd'
} 
  else if ( col_name == 'occupancy_encd' ) {
col_name_original <- 'occupancycd'
} 
  else {
  col_name_original <- col_name
}
  
  
pd_col <- pd_dataset[pd_dataset$feature == col_name,][c('value','pd')]
colnames(pd_col) <- c(col_name,'pd')

df <- merge(dataset[c(col_name,'hasclaim')],pd_col,by=col_name)  



if (grepl('encd',col_name) | 
    col_name =='fire_risk_model_score' | 
    col_name =='customer_cnt_active_policies_binned' |
    col_name =='cova_deductible' | 
    col_name =='cova_limit' | 
    col_name =='protectionclass' | 
    col_name =='ordinanceorlawpct' | 
    col_name =='numberoffamilies' | 
    col_name =='waterded' | 
    col_name =='units' | 
    col_name =='stories') {
  XBreaks <- as.vector(unlist(unique(dataset[c(col_name)])))
  XLabels <- as.vector(unlist(unique(dataset[c(col_name_original)]))) 
} 
else if (col_name =='pipe_froze_3_blk' | 
col_name =='water_risk_3_blk' | 
col_name =='ustructure_fail_3_blk' | 
col_name =='water_risk_fre_3_blk' | 
col_name =='waterh_fail_3_blk' | 
col_name =='rep_cost_3_blk' | 
col_name =='plumb_leak_3_blk' | 
col_name =='appl_fail_3_blk' | 
col_name =='fixture_leak_3_blk') {
  XBreaks <- c(0,1,2,3,4,5)
  XLabels <- c('Low', 'Elevated', 'Below Avg', 'Average', 'High', 'Highest')    
}
else {
  XBreaks <- c(0,1)
  XLabels <- c("No", "Yes")  

}

cols <- c('PD'='#f04546')
p1 = ggplot(df) +
     geom_bar(aes(x = .data[[col_name]], fill=hasclaim)) +
     scale_fill_manual(breaks = c("0","1"), 
                       values=c("#56B4E9","red")) +   
     geom_line(aes(x = .data[[col_name]], y=(pd)*ratio*100,color='PD')) +
     scale_y_continuous(sec.axis = sec_axis(~./ratio, name = "PD")) +
     scale_x_continuous( breaks=XBreaks, labels= XLabels) +
     labs(x = col_name, y = 'Count', title = paste("Histogram and claims partial dependency of", col_name_original)) +
     theme(axis.text.x = element_text(angle = 90, hjust = 1))


if (length(unique(dataset[[col_name_original]]))  <= 6) 
{p2 = bayesian_plot(data=dataset, input=col_name_original, target='hasclaim')}

p3 = cross_plot(data=dataset, str_input=col_name_original, str_target='hasclaim',plot_type='percentual')


if (length(unique(dataset[[col_name_original]]))  <= 6) {
  print(p1)
  print(p2)
  print(p3)
  }
else {  
  print(p1)
  print(p3)
  }
  
}
plot_categorical('usagetype_encd')  

## NULL

The more property is used, the higher claims rate.

plot_categorical('customer_cnt_active_policies_binned')  

## NULL

If the same customer has more then 10 active policies, the claim rate is lower.

plot_continuous('ecy')  
## Loading required package: grid

The longer the exposure, the higher claim rate.

plot_categorical('cova_deductible')  

## NULL

The claim rate is higher in low deductible policies.

plot_continuous('yearbuilt')  

More claims are in newer houses but not in modern.

plot_categorical('landlordind')  

## NULL

This is a discount based on the number of policies for the same customer. It’s correlated with customer_cnt_active_policies_binned and has the same claim dependency but without details: more policies less claims rate.

plot_categorical('pipe_froze_3_blk')  

## NULL

Everything indicates we have more claims in the lowest category. It may makes sense for california, where we have most insured properties in an area where low tempretures are rare but not useful for further analysis.

plot_categorical('roofcd_encd')  

## NULL

Visible higher claim rate in WOOD, TILE, and maybe, TAR and OTHER

plot_continuous('sqft')  

The higher sqft, the higher claim rate till some limit, where it is not increased.

plot_categorical('firealarmtype')  

## NULL

Firealarmtype is not directly related to water claims. There are some evidence there is a dependency but, probably, due to correlations with other attributes. Adding the predictor to a model increase overfitting.

plot_categorical('stories')  

## NULL

Theer is a clear evidence, the 2-stories properties have more claims. However, the attributes is broken in the database. There are a lot of empty or high values (known issues).

plot_categorical('equipmentbreakdown')  

## NULL

Equipmentbreakdown is not directly related to water claims. There are some evidence there is a dependency but, probably, due to correlations with other attributes. Adding the predictor to a model increase overfitting.

plot_categorical('cova_limit')  

## NULL

More claims from more expensive properties.

plot_categorical('replacementvalueind')  

## NULL

Replacementvalueind is not directly related to water claims. There are some evidence there is a dependency but, probably, due to correlations with other attributes.

plot_categorical('propertymanager')  

## NULL

Even if PropertyManager decrease number of claims according to visual analysis it is not clear from the partial dependency. Probably due to correlations.

plot_categorical('multipolicyind')  

## NULL

On the one hand, there are more claims in “Yes” multipolicyind category, on the other, it’s different in the partial dependency. The predictor is not very significant in GLM

plot_categorical('poolind')  

## NULL

More claims in properties with pools according to the charts and baysian comparizon but it’s different in the partial dependency. GLM results are more close to the visual and significant. The difference between “No” and “Yes” categories is very small and can be due to errors.

plot_categorical('replacementcostdwellingind')  

## NULL

Replacementcostdwellingind is not directly related to water claims. There are some evidence there is a dependency but, probably, due to correlations with other attributes. There is almost no difference between categories in the partial dependency.

plot_categorical('safeguardplusind')  

## NULL

Safeguardplusind is not directly related to water claims. There are some evidence there is a dependency but, probably, due to correlations with other attributes. There is almost no difference between categories in the partial dependency and charts.

plot_categorical('ustructure_fail_3_blk')  

## NULL

There are slight increase in the “Below Average” category partial dependency but not in visual.

plot_continuous('water_risk_3_blk')  

The higher the score, the more claims according to box-plots and partial dependency.

The rest of the predictors are not very important or directly related to water claims.

plot_categorical('serviceline')  

## NULL
plot_categorical('protectionclass')  

## NULL
plot_continuous('water_risk_fre_3_blk')  

plot_categorical('rep_cost_3_blk')  

## NULL
plot_categorical('waterh_fail_3_blk')  

## NULL
plot_categorical('deadboltind')  

## NULL
plot_categorical('homegardcreditind')  

## NULL
plot_categorical('ordinanceorlawpct')  

## NULL
plot_categorical('occupancy_encd')  

## NULL
plot_categorical('burglaryalarmtype')  

## NULL
plot_categorical('waterded')  

## NULL
plot_categorical('plumb_leak_3_blk')  

## NULL
plot_categorical('appl_fail_3_blk')  

## NULL
plot_categorical('numberoffamilies')  

## NULL
plot_categorical('units')  

## NULL
plot_continuous('water_risk_sev_3_blk')  

plot_categorical('multipolicyindumbrella')  

## NULL
plot_categorical('kitchenfireextinguisherind')  

## NULL
plot_categorical('fixture_leak_3_blk')  

## NULL
plot_categorical('constructioncd_encd')  

## NULL
plot_categorical('fire_risk_model_score')  

## NULL
plot_categorical('gatedcommunityind')  

## NULL
plot_categorical('sprinklersystem')  

## NULL
plot_categorical('neighborhoodcrimewatchind')  

## NULL
plot_categorical('rentersinsurance')  

## NULL
plot_categorical('earthquakeumbrellaind')  

## NULL
plot_categorical('functionalreplacementcost')  

## NULL

Misc visualization

ggplot(data=dataset, mapping = aes(x=yearbuilt, y=sqft, color=hasclaim)) +
  geom_point() +
  scale_color_manual(breaks = c("0","1"), 
                       values=c("#56B4E9","red")) + 
  scale_fill_manual(breaks = c("0","1"), 
                       values=c("#56B4E9","red")) +
  labs(title = "Sqft vs YearBuilt and HasClaim")

We have more policies between 1945 and 2010 with more claims in this period.

ggplot(dataset, aes(x=roofcd, y=yearbuilt, col=hasclaim, fill=hasclaim)) +
  geom_boxplot(notch = TRUE) +
#scale_color_manual(breaks = c("0","1"), 
#                 values=c("#56B4E9","red")) + 
scale_fill_manual(breaks = c("0","1"), 
                       values=c("#56B4E9","red")) +
  theme(legend.position = "none") +
  labs(x = "Roof Code", title = paste("Box Plot of YearBuilt vs Roof Code"))

Tile and wood is used in more modern houses with lower rate of claims. TAR is used in older. COMPO, TAR and OTHER have more claims then other